# !diagnostics off
invisible(utils::memory.limit(750000))



############################################
# Set the working directory to the location where you have stored the replication files

setwd("E:/Dropbox/Research/pill/PillUptakeReplication")
# setwd("/home/randy/Dropbox/Research/pill")

############################################

library(tidyverse)
library(lubridate)

############## OPR
# data file from OPR that contains geographic identifiers
nfsus = readLines("nfs70.dat")
# One person has an invalid Location Area Number string (LAN of "86 02" for id "3737"), 
# but the surrounding observations make it clear what the value should be ("86002")
nfsus = str_replace(nfsus, "37371186 020130", "373711860020130")

# Function to read in fixed-width file
read_colindex = function(data, starts, names, lastchar){
  nCols = length(starts)
  if(missing(lastchar)){
    lastchar = str_length(data[1])
  }
  ends = c((dplyr::lead(starts)-1)[1:(nCols-1)], lastchar)
  dataList = 
    data %>%
    map(str_sub, start=starts, end=ends) %>%
    unlist
  emptyStrings = str_detect(dataList, "^[[:space:]]*$")
  dataList[emptyStrings] = NA
  dataList = as.integer(dataList)
  if(missing(names)){
    dataOut = 
      dataList %>%
      matrix(nrow=length(data), byrow=T) %>%
      as_tibble
  }else{
    dataOut = 
      dataList %>%
      matrix(nrow=length(data), byrow=T, dimnames = list(1:length(data), names)) %>%
      as_tibble
  }
  return(dataOut)
}

# Start positions for variables in the NFS file
# Does not include all pregnancy periods
nfs_starts = as.integer(c(1,5,6,7,12,14,16,17,18,21,25,28,29,31,32,
               39:52,55,56,59:79,81,83:86, 
               87,90,91,
               93,96,97,
               99,102,103,
               105:107,
               108,111,112,115,118,
               121,124,125,128,131,
               134,137,138,141,144,
               147,150,151,154,157,
               160,162,165,
               166,167,170,173,176,177,180,181,184,185,188, 
               189,192,193,195,196,198,199,202,204,205,
               207:210,213,214,216,217,219,220,223:227,230:233,
               236:238,241,244,245,248:251,253,255,257,259:261,
               279,281:283,285,286,288,290,292,294,295,296,298,300,
               304,306,308:312,314,316:319,321,324,325,327:333,
               336:398,400,401,411,412,416,417,
               419,420,464:468,472:474,476,477,479,481,483:485,491,
               497,503,504,506,508,510,511,513,515,517,520,523,
               526,528,530,534:537,539,541,543,544,546,549,552:554,558:565,568,
               569:571,574:577,580,581,584,586:588,591:598,601,603,605,607,610,
               613,614,617,619,621,623,625,627,629:635,638,640:647,
               648:670,
               671:686,688:693,695:697,700,703,
               706,709,712,715,718,721,724,727,730,733,736,739,742,745,
               748,751,754,757,760,763,766,769,771,773,775,776,779,782,
               785,786,787,788,790,792,794,796,798,800,802,816,819,
               820,823,825,828,830,832,834,836,839,840,842,844,846,848,850,
               852,853,855,858,861,863,864,865,866,868,869,870,871,
               872,873,875,878,881,884,887,890,893,894,896,897,
               900,901,902,904,906,907,908))

# Variable names
# Some variables were left out due to irrelevance
nfs_names = c(
  # General
  "id","Marr","NumEligibleF", "LAN", "Chunk", "DU", "DUX", "NumInHH", 
  "IntDuration","InterviewerID","IntDate","Cooperation","OthersPresent",
  "Quality","ReasonForQ",
  # Residence history
  "FarmNow", "FarmAlways", "FarmEver", "Farm0616", "State0616",
  "BothParents14", "ParentDied",
  "HState0616", "HFarm0616", "HBothParents14", "HParentDied",
  "RaceNeighborhood", "FarmHistory",
  # Age and race
  "DOB", "Age", "HDOB", "HAge", "Race", 
  # Family
  "BrosAny", "BrosOlder", "BrosYounger", "BrosAnyTwin",
  "SisAny", "SisOlder", "SisYounger", "SisAnyTwin",
  "HBrosAny", "HBrosOlder", "HBrosYounger", "HBrosAnyTwin",
  "HSisAny", "HSisOlder", "HSisYounger", "HSisAnyTwin",
  # Pregnancies, abortions, miscarriages: general
  "BabyEver", "PregNow", "PregNPast", "BirthsNLive",
  "MiscarriageEver", "MiscarriageN", 
  "AbEver", "AbN", 
  "Ab1Date", "Ab1FamPlan", "Ab1PregNum", 
  "Ab2Date", "Ab2FamPlan", "Ab2PregNum", 
  "Ab3Date", "Ab3FamPlan", "Ab3PregNum",
  # Marriage history
  "MarrIs1", "HMarrIs1", "MarrN",
  "Marr1PreBegan", "Marr1PreEndHow", "Marr1PreDeathDate", "Marr1PreDivDate", "Marr1PreDivDateSep",
  "Marr2PreBegan", "Marr2PreEndHow", "Marr2PreDeathDate", "Marr2PreDivDate", "Marr2PreDivDateSep",
  "Marr3PreBegan", "Marr3PreEndHow", "Marr3PreDeathDate", "Marr3PreDivDate", "Marr3PreDivDateSep",
  "Marr4PreBegan", "Marr4PreEndHow", "Marr4PreDeathDate", "Marr4PreDivDate", "Marr4PreDivDateSep",
  # Most recent/present marriage
  "HAgeMarr1", "MarrBegan", "MarrDuration", "WidowDivSep", "HDeathDate", "DivDate", "SepDate",
  "DivLivedRelAfterSep", "DivLivedRelAfterSepDuration", "DivWillMarryAgain", 
  "SepHowLongAgo",
  "SepLivedRelAfterSep", "SepLivedRelAfterSepDuration", "SepWillGetTogetherAgain", 
  # Respondent's education
  "NGradesCompletedAtMarr1", "HSCollSinceMarr", "HighestGradeCompleted",
  "HSCollAttending", "AttendedHSCollYearLast", "WillAttendSchoolAgain", "WhenWillAttendSchoolAgain",
  "HighestGradeHopeToComplete", "SchoolOtherAfterMarr", "SchoolOtherAfterMarrWhen", 
  "OtherSchoolingIntended", "SchoolOtherBeforeMarr", "TotalEd", 
  # Husband's education
  paste0("HEd", 117:124),
  # "HEd1", "HEd2", "HEd3", "HEd4", "HEd5", "HEd6", "HEd7", "HEd8"
  # Employment and income
  paste0("EmpInc", 125:162),
  # Religion: respondent
  "RelPrefGrowingUp", "RelPrefNow", "CommunionFrequency", "RelImportance", "Rel", "PolViews",
  # Religion: husband
  "HRelPrefGrowingUp", "HRelPrefNow", "HCommunionFrequency", "HRelImportance", "HRel",
  # First husband
  "H1RelPref", "H1NGradesCompletedAtMarr", "H1HSCollSinceMarr", "H1AgeAtMarr", "H1MarrBefore",
  # Family planning clinic
  "FPCEver", "FPCLast12Mo", "DiscussedFPEver", "DiscussedFPLast12Mo", "DiscussedFPAtMarr", "DiscussedFPLastDate",
  # Pill
  "PillFirstFromFPC", "PillFirstFromDoc", "PillAskedFor", "PillProlemsDiscussed",
  # IUD
  "IUDFirstFromFPC", "IUDFirstFromDoc", "IUDAskedFor", "IUDProlemsDiscussed", "IUDCheck",
  # Diaphragm
  "DiaphragmFirstFromFPC", "DiaphragmFirstFromDoc", "DiaphragmAskedFor",
  # Pill use
  "PillEver", 
  # "PillWhen", 
  paste0("PillY",rep(60:70, each=4),"Q",rep(1:4, length(60:70))),
  "PillNow", "PillStoppedForProbs", "PillStoppedForWorry", 
  "PillForContra", "PillForMedicalOnly", "PillBrandLast", "PillBrandAnyOther", "PillBrandOthers",
  "PillFuture", "PillFutureNotReasons", "PillNowReasons", "PillStoppedReasons",
  # IUD use
  "IUDEver", "IUDWhen", "IUDNow", "IUDStoppedForProbs", 
  "IUDFuture", "IUDStoppedReasons", "IUDFutureNotReasons",
  # Diaphragm
  "DiaphragmEver"
)
if(length(nfs_names)<length(nfs_starts)){
  nfs_names = c(nfs_names, nfs_starts[(length(nfs_names)+1):length(nfs_starts)])
}

# Separate the data into columns
nfs = read_colindex(nfsus, nfs_starts, names=nfs_names, lastchar=908)


# Location Codes from Appendix A in NFS documentation
# Page 1
states = 
  list(
    CT = c(13001:13013),
    NY = c(21001:21078, 23001:23013),
    PA = c(21079:21091, 21099, 21101, 21103, 21104),
    DC = c(21092:21098, 21100, 21102),
    NJ = c(21105:21117),
    IL = 31001:31039,
    MI = c(31040:31052, 33014:33026, 33027:33039, 33040:33052),
    OH = c(31053:31065, 33001:33013),
    MN = c(41001:41013),
    MO = c(43001:43013),
    MD = c(51001:51013),
    GA = c(51014:51026),
    # Page 2
    VA = c(53001:53026),
    TN = c(63001:63013),
    AL = c(63014:63026),
    TX = c(71001:71013, 73001:73013, 73027:73039),
    OK = c(73014:73026),
    AZ = c(83001:83013),
    NV = c(83014, 83020, 83021, 83024, 83025),
    UT = c(83015:83019, 83022, 83023, 83026),
    CA = c(91001:91052, 93014:93026),
    WA = c(93001:93013),
    # Page 3 (may cross state lines)
    MA = c(12001:12026, 14014:14026),
    CT = c(14001:14013),
    NY = c(22001:22026, 24001:24013),
    PA = c(22027:22039, 22053:22065),
    NJ = c(22040:22052, 22066:22078),
    IL = c(32001:32026),
    OH = c(32027:32052, 34001:34026),
    WI = c(34027:34039),
    MN = c(42001:42013),
    KS = c(42014:42026, 44003, 44004, 44006:44008, 44011:44013),
    NE = c(44002),
    MO = c(44001, 44005, 44009, 44010),
    MD = c(52001:52013),
    # Page 4
    GA = c(52014:52026),
    DE = c(54001),
    NJ = c(54002:54013),
    FL = c(54014:54026),
    VA = c(54027:54052),
    TN = c(64001:64013),
    OK = c(74001:74013),
    AZ = c(84001:84013),
    UT = c(84014:84026),
    CA = c(92001:92065, 94001:94039),
    # Page 5
    ME = c(15001:15026),
    VT = c(16001:16013),
    NH = c(16014:16026),
    NJ = c(25001:25013),
    PA = c(25014:25026),
    NY = c(26001:26013),
    PA = c(26014:26019,26021, 26023, 26026),
    NJ = c(26020,26022,26024,26025),
    OH = c(35001:35013),
    IN = c(35014:35026),
    IL = c(35027:35039),
    # Page 6
    MI = c(36001:36006, 36014:36026),
    IN = c(36027:36039),
    WI = c(36040:36052),
    IL = c(36053:36065),
    IA = c(45001:45013),
    ND = c(45014:45026),
    KS = c(45027:45039),
    MN = c(46001:46013),
    IA = c(46014:46026),
    # Page 7
    SD = c(46027:46029, 46031, 46034:46036, 46038, 46039),
    ND = c(46030, 46032, 46033, 46037),
    KS = c(46040, 46046, 46047),
    NE = c(46041:46045, 46048:46052),
    NC = c(55001:55013),
    FL = c(55014:55039),
    MD = c(56001:56013),
    WV = c(56014:56026),
    NC = c(56027:56039),
    SC = c(56040:56052),
    GA = c(56053:56065),
    FL = c(56066:56078),
    KY = c(65001:65013, 66001:66013),
    # Page 8
    TN = c(66014:66026),
    MS = c(66027:66039),
    AR = c(75001:75013),
    TX = c(75014:75026),
    LA = c(76001:76013),
    OK = c(76014:76026),
    TX = c(76027:76039),
    NM = c(85001:85013),
    MT = c(86001:86013),
    # Page 9
    CA = c(95001:95013),
    WA = c(96001:96013),
    OR = c(96014:96026),
    CA = c(96027:96039)
  ) %>%
  unlist
# map(seq_along(states), print)
states = 
  tibble(
    state = names(states),
    LAN = states
  ) %>%
  mutate(state=str_sub(state,1,2))

# Create states variable in nfs data set
nfs =
  nfs %>%
  left_join(states, by=c("LAN"="LAN"))


# Function for converting dates
# Deals with special codes
DateConv = function(var){
  var = ifelse(var>=997, NA, var)
  if_else(var<900,
          ymd("1899-12-01") + months(var),
          ymd(paste(var %% 100 + 1900, "06", "01", sep="-"))
  )
}

nfs$BornDate = DateConv(nfs$DOB)
nfs$BornYear = year(nfs$BornDate)
nfs$BornMonth = month(nfs$BornDate)

# Interview date
nfs$IntMonth = (nfs$IntDate - nfs$IntDate%%100)/100
nfs$IntDay = nfs$IntDate%%100
nfs$IntDay[nfs$IntDay==99] = 1
nfs$IntDate = ifelse(nfs$IntMonth<3, 
                     paste(1970,nfs$IntMonth+10, nfs$IntDay, sep="-"), 
                     paste(1971,nfs$IntMonth-2,  nfs$IntDay, sep="-"))
nfs$IntDate = ymd(nfs$IntDate)


states = tibble(
  StateCode = state.abb,
  State = state.name)

nfs = 
  nfs %>%
  rename(StateCode=state) %>%
  left_join(states, by="StateCode")

# nfs = 
#   nfs %>%
#   mutate(
#     # YC18_BGDB = ifelse(BornDate+years(18)>YCDate_BGDB, 1, 0),
#     #      YC19_BGDB = ifelse(BornDate+years(19)>YCDate_BGDB, 1, 0),
#     #      YC20_BGDB = ifelse(BornDate+years(20)>YCDate_BGDB, 1, 0),
#     #      YC21_BGDB = ifelse(BornDate+years(21)>YCDate_BGDB, 1, 0),
#          # YC18_Myers = ifelse(BornDate+years(18)>YCDate, 1, 0),
#          # YC19_Myers = ifelse(BornDate+years(19)>YCDate, 1, 0),
#          # YC20_Myers = ifelse(BornDate+years(20)>YCDate, 1, 0),
#          # YC21_Myers = ifelse(BornDate+years(21)>YCDate, 1, 0),
#          # YC18_BGDB_Year = ifelse(BornYear+18>YCYear, 1, 0),
#          # YC19_BGDB_Year = ifelse(BornYear+19>YCYear, 1, 0),
#          # YC20_BGDB_Year = ifelse(BornYear+20>YCYear, 1, 0),
#          # YC21_BGDB_Year = ifelse(BornYear+21>YCYear, 1, 0),
#          YC18_BHM = ifelse(BornYear+18>BHM2012, 1, 0),
#          YC19_BHM = ifelse(BornYear+19>BHM2012, 1, 0),
#          YC20_BHM = ifelse(BornYear+20>BHM2012, 1, 0),
#          YC21_BHM = ifelse(BornYear+21>BHM2012, 1, 0)
#          # Legacy code:
#          # YC18 = ifelse(BornYear+18>BHM2012, 1, 0),
#          # YC19 = ifelse(BornYear+19>BHM2012, 1, 0),
#          # YC20 = ifelse(BornYear+20>BHM2012, 1, 0),
#          # YC21 = ifelse(BornYear+21>BHM2012, 1, 0)
#          )


# Has respondent ever taken the pill
# 1: yes
# 2: no
# 9: No answer
# table(nfs$PillEver)
nfs$PillEver = ifelse(nfs$PillEver<9, 2-nfs$PillEver, NA)


# Pill use dates
# Pill use by quarter and year
# There are separate variables for each quarter of each year that contain 
# numbers that indicate categories of pill use. 
# 0 is "not used". 
# 1, 2, and 3 indicate use only in the first, second, or third month. 
# 4, 5, and 6 indicate use in combinations of two months. 
# 7 indicates use in all three months. 
# I want to turn this into monthly panels with one variable to indicate use.
# The variables I have are named things like PillY60Q1, 
# indicating that the variable reports birth control pill use in Q1 of 1960.
PillNames = str_subset(names(nfs), "PillY")

PillReplace = function(var){
  ifelse(nfs$PillEver==0 & !is.na(nfs$PillEver), 0, var)
}
nfs = 
  nfs %>%
  mutate_at(vars(PillNames), PillReplace)


pill = 
  nfs %>%
  filter(!is.na(nfs$PillEver)) %>%
  select(id, PillEver, PillNames)%>%
  gather(key=QuarterYear, value=PillUseCode, PillNames) %>%
  mutate(Year = as.integer(substr(QuarterYear, 6, 7))+1900L,
         Quarter = as.integer(substr(QuarterYear, 9, 9)),
         PillUseCode = ifelse(PillEver==0, 0, PillUseCode))
# 8 means no month was given. 
# Using in every month is the most common case, 
# so I assign 8s to use in every month.
pill$PillUsedMonth1 = as.integer(ifelse(pill$PillUseCode%in%c(1,4,5,7,8), 1, 0))
pill$PillUsedMonth2 = as.integer(ifelse(pill$PillUseCode%in%c(2,4,6,7,8), 1, 0))
pill$PillUsedMonth3 = as.integer(ifelse(pill$PillUseCode%in%c(3,5,6,7,8), 1, 0))

nfspanel = 
  pill %>%
  gather(key=MonthOfQuarter, value=Pill, PillUsedMonth1:PillUsedMonth3) %>%
  mutate(MonthOfQuarter = as.integer(as.numeric(substr(MonthOfQuarter, 14, 14))),
         Month = as.integer((Quarter-1)*3+MonthOfQuarter))

nfspanel = 
  nfspanel %>% 
  mutate(Date = ymd(paste(Year, Month, "01", sep="-"))) %>%
  arrange(id, Year, Quarter, MonthOfQuarter) %>% 
  select(-QuarterYear, -PillUseCode)

nfspanel = 
  nfspanel %>%
  left_join(nfs %>% 
              select(id, State, StateCode, BornDate, BornYear, BornMonth, IntDate, PillEver)) %>%
  mutate(AgeMonths = (Year-BornYear)*12 + Month-BornMonth,
         Age = AgeMonths/12,
         AgeYrRound = floor(Age))
nfspanel$AgeGroup = NA 
nfspanel$AgeGroup[nfspanel$Age>=12 & nfspanel$Age<16] = "12-15"
nfspanel$AgeGroup[nfspanel$Age>=16 & nfspanel$Age<18] = "16-17"
nfspanel$AgeGroup[nfspanel$Age>=18 & nfspanel$Age<21] = "18-20"
nfspanel$AgeGroup[nfspanel$Age>=21 & nfspanel$Age<31] = "21-30"
nfspanel$AgeGroup[nfspanel$Age>=31 & nfspanel$Age<41] = "31-40"


# Pill use variables
nfspanel = 
  nfspanel %>%
  group_by(id) %>%
  mutate(PillBefore17 = ifelse(max(Pill[Age<17])==-Inf, 0, max(Pill[Age<17])),
         PillBefore18 = ifelse(max(Pill[Age<18])==-Inf, 0, max(Pill[Age<18])),
         PillBefore19 = ifelse(max(Pill[Age<19])==-Inf, 0, max(Pill[Age<19])),
         PillBefore20 = ifelse(max(Pill[Age<20])==-Inf, 0, max(Pill[Age<20])),
         PillBefore21 = ifelse(max(Pill[Age<21])==-Inf, 0, max(Pill[Age<21])),
         PillBefore22 = ifelse(max(Pill[Age<22])==-Inf, 0, max(Pill[Age<22])),
         PillBefore23 = ifelse(max(Pill[Age<23])==-Inf, 0, max(Pill[Age<23])),
         AgeAtFirstPill = ifelse(max(Pill)==1, min(Age[Pill==1]), NA),
         AgeAtFirstPillCensored = ifelse(max(Pill)==1, min(Age[Pill==1]), 99),
         FirstPillDate = if_else(max(Pill)==1, min(Date[Pill==1]), as.Date(NA)),
         FirstPillDateCensored = if_else(max(Pill)==1, min(Date[Pill==1]), ymd("2000-01-01")),
         FirstPill = ifelse(Age==AgeAtFirstPill, 1, 0)) %>%
  ungroup %>%
  mutate(AgeMonths = round(AgeMonths),
         AgeMonthsStart = AgeMonths-1L,
         StateFactor = factor(State),
         BornFactor = factor(BornYear),
         AgeFactor = factor(round(Age)))

# Put the pill use vars back into nfs
nfs = 
  nfs %>%
  left_join(nfspanel %>% 
              select(id, PillBefore17:FirstPillDateCensored) %>% 
              group_by(id) %>%
              filter(dplyr::row_number()==1) %>%
              ungroup) %>%
  mutate(StateFactor = factor(StateCode),
         BornFactor = factor(BornYear))


rm(states, pill)
rm(DateConv, PillReplace, read_colindex, PillNames, nfs_names, nfs_starts, nfsus)

save.image("nfs.RData")
